#PBS -N 03AB_sortmerna_fastqc
#PBS -A GT-wratcliff3
#PBS -q inferno
#PBS -j oe
#PBS -o logs/03AB_sortmerna_fastqc.out
    
# change to project directory
PROJECT_DIR=$PBS_O_WORKDIR
echo "Changing directory to ${PROJECT_DIR}…"
cd $PROJECT_DIR
     
# activate environment
PIPELINE_ENV=`basename scripts/*.yml .yml`
echo "Activating conda environment ${PIPELINE_ENV}…"
source activate $PIPELINE_ENV
    
# extract sample name
samplenames=(`cat raw_data/samplenames.txt`)
sampleindex=$PBS_ARRAYID
samplename=${samplenames[$(($sampleindex-1))]}
     
echo -e "\n########## Start processing sample ${sampleindex}: ${samplename} ##########\n"

echo -e "\n########## Starting SortMeRNA ##########\n"

# ref
sortmerna_rRNA_fasta=/storage/home/hcoda1/6/ktong34/p-wratcliff3-0/ngs_ref/ref_data/sortmerna_v4.2.0/rRNA_databases_fasta
sortmerna_rRNA_index=/storage/home/hcoda1/6/ktong34/p-wratcliff3-0/ngs_ref/ref_index/sortmerna_v4.2.0/rRNA_databases_index

# input/output
in=results/02A_trimgalore
in_R1=$in/${samplename}_paired_trimmed_R1.fastq.gz
in_R2=$in/${samplename}_paired_trimmed_R2.fastq.gz
out=results/03A_sortmerna
out_kvdb=$out/kvdb_$samplename  # kvdb must be empty before each run
out_out=$out/out
out_aligned_prefix=$out_out/${samplename}_rRNA_aligned
out_filtered_prefix=$out_out/${samplename}_rRNA_filtered
mkdir -p $out $out_kvdb $out_out

# re-compress files (only for paired trimmed reads)
for file in $in_R1 $in_R2; do gzip -d $file; gzip ${file%.gz}; done

# main
sortmerna \
--ref $sortmerna_rRNA_fasta/silva-bac-23s-id98.fasta \
--ref $sortmerna_rRNA_fasta/silva-bac-16s-id90.fasta \
--ref $sortmerna_rRNA_fasta/silva-arc-23s-id98.fasta \
--ref $sortmerna_rRNA_fasta/silva-arc-16s-id95.fasta \
--ref $sortmerna_rRNA_fasta/rfam-5s-database-id98.fasta \
--ref $sortmerna_rRNA_fasta/rfam-5.8s-database-id98.fasta \
--ref $sortmerna_rRNA_fasta/silva-euk-28s-id98.fasta \
--ref $sortmerna_rRNA_fasta/silva-euk-18s-id95.fasta \
--reads $in_R1 --reads $in_R2 \
--idx $sortmerna_rRNA_index \
--kvdb $out_kvdb \
--fastx \
--aligned $out_aligned_prefix \
--other $out_filtered_prefix \
--paired_in \
--out2 \
--threads $PBS_NP

# rename and compress files
for file in $out_out/${samplename}*.fastq; do mv $file `echo $file | sed 's/fwd./R1./' | sed 's/rev./R2./'`; done
gzip $out_out/${samplename}*.fastq

echo -e "\n########## Starting FastQC of rRNA-filtered reads ##########\n"

# input/output
in=results/03A_sortmerna/out
in_R1=$in/${samplename}_rRNA_filtered_R1.fastq.gz
in_R2=$in/${samplename}_rRNA_filtered_R2.fastq.gz
out=results/03B_fastqc_rRNA_filtered
mkdir -p $out

# main
fastqc \
$in_R1 $in_R2 \
-o $out \
-t $PBS_NP

